def date_to_mpl_day_of_year(dt):
    """Get a matplotlib-compatible date number, ignoring the year (to represent day of year)

    The year is swapped for the current year so that values from different years share
    one calendar axis (the same year that ``get_xlim`` uses for its limits).

    Args:
        dt: A date/datetime-like value, or a missing value (``None``, ``NaN``, ``NaT``)

    Returns:
        The result of ``matplotlib.dates.date2num``, or ``None`` if ``dt`` is missing or
        cannot be moved into the current year (``replace()`` raises ``ValueError``,
        e.g. for Feb 29 when the current year is not a leap year)
    """
    # Missing values have no usable ``replace()``; skip them instead of raising AttributeError
    if pd.isna(dt):
        return None
    try:
        return dates.date2num(dt.replace(year=datetime.now().year))
    except ValueError:
        return None


def date_to_mpl_time(dt):
    """Get a matplotlib-compatible date number, ignoring the date (to represent time of day)

    Args:
        dt: A datetime-like value, or a missing value (``None``, ``NaN``, ``NaT``)

    Returns:
        The fractional part of ``matplotlib.dates.date2num(dt)`` (a fraction of one day),
        or ``None`` if ``dt`` is missing or ``date2num`` raises ``ValueError``
    """
    if pd.isna(dt):
        return None
    try:
        # date2num counts in days, so the remainder modulo 1 is the time-of-day portion
        return dates.date2num(dt) % 1
    except ValueError:
        return None


def to_local_tz(dt):
    """Convert a datetime object to the local time zone

    Args:
        dt: A datetime-like value, or a missing value (``None``, ``NaN``, ``NaT``)

    Returns:
        ``dt`` converted with ``astimezone(tz.tzlocal())``, or ``None`` if ``dt`` is missing
        or the conversion raises ``TypeError``/``ValueError``
    """
    # Missing values have no ``astimezone()``; skip them instead of raising AttributeError
    if pd.isna(dt):
        return None
    try:
        return dt.astimezone(tz.tzlocal())
    except (TypeError, ValueError):
        return None


def get_xlim():
    """Get limits of x axis for first and last days of the year

    Returns:
        Tuple of ``(xmin, xmax)`` matplotlib date numbers for Jan 1 and Dec 31 of the
        current year. Both are built without a time component, so ``xmax`` is midnight
        at the start of Dec 31.
    """
    current_year = datetime.now().year
    xmin = dates.date2num(datetime(current_year, 1, 1))
    xmax = dates.date2num(datetime(current_year, 12, 31))
    return xmin, xmax


def get_colormap(color):
    """Make a colormap (gradient) based on the given color; copied from seaborn.axisgrid

    Args:
        color: Any color specification accepted by ``matplotlib.colors.to_rgb``

    Returns:
        The colormap produced by ``seaborn.blend_palette(..., as_cmap=True)``
    """
    # ``to_rgb`` is the public function behind the legacy ``colorConverter`` alias
    color_rgb = mpl.colors.to_rgb(color)
    # 12 versions of the color, with HLS lightness stepping from 1 down to 0
    colors = [
        sns.set_hls_values(color_rgb, l=lightness) for lightness in np.linspace(1, 0, 12)
    ]
    return sns.blend_palette(colors, as_cmap=True)


def pdir(obj, sort_types=False, non_callables=False):
    """Pretty-print the attributes of an object along with the type name of each value

    Args:
        obj: Object to inspect
        sort_types: Order the output by type name instead of by attribute name
        non_callables: Leave out functions, methods, method-wrappers, and builtins

    Returns:
        None; the ``{attribute: type name}`` mapping is printed with ``pprint``
    """

    def _type_name(attr):
        # dir() can list attributes (e.g., properties) that raise when read; report those
        # instead of aborting the whole listing
        try:
            return type(getattr(obj, attr)).__name__
        except Exception as exc:
            return f'<unreadable: {type(exc).__name__}>'

    attrs = {attr: _type_name(attr) for attr in dir(obj)}
    if sort_types:
        attrs = dict(sorted(attrs.items(), key=lambda x: x[1]))
    if non_callables:
        attrs = {
            k: v
            for k, v in attrs.items()
            if v not in ['function', 'method', 'method-wrapper', 'builtin_function_or_method']
        }
    # Keep the type-sorted order when requested; otherwise let pprint sort by attribute name
    pprint(attrs, sort_dicts=not sort_types)
{\n", " 24: 'Iowa',\n", " 1911: 'Iowa',\n", " 2840: 'Iowa',\n", " 8680: 'Iowa City',\n", " 136739: 'Eastern Iowa and Minnesota Drift Plains (US EPA Level IV Ecoregion)',\n", " 119385: 'Iowa Wetland Management District',\n", " 208908: 'Big Sioux River Corridor South Dakota and Iowa',\n", " 161392: 'Upper Iowa River Wildlife Management Areas',\n", " 221041: 'Iowa KS-NE Reservation',\n", " 207525: 'University of Northern Iowa',\n", " 137891: 'Pammel State Park, Winterset, Iowa',\n", " 125537: 'Terry Trueblood Wetland Exploration Trail',\n", " 172799: 'Ashton Prairie',\n", " 174271: 'Des Moines Iowa Park',\n", " 151098: 'Mount Vernon, Iowa walking path'\n", "}\n", "\n" ], "text/plain": [ "\u001b[1m{\u001b[0m\n", " \u001b[1;36m24\u001b[0m: \u001b[32m'Iowa'\u001b[0m,\n", " \u001b[1;36m1911\u001b[0m: \u001b[32m'Iowa'\u001b[0m,\n", " \u001b[1;36m2840\u001b[0m: \u001b[32m'Iowa'\u001b[0m,\n", " \u001b[1;36m8680\u001b[0m: \u001b[32m'Iowa City'\u001b[0m,\n", " \u001b[1;36m136739\u001b[0m: \u001b[32m'Eastern Iowa and Minnesota Drift Plains \u001b[0m\u001b[32m(\u001b[0m\u001b[32mUS EPA Level IV Ecoregion\u001b[0m\u001b[32m)\u001b[0m\u001b[32m'\u001b[0m,\n", " \u001b[1;36m119385\u001b[0m: \u001b[32m'Iowa Wetland Management District'\u001b[0m,\n", " \u001b[1;36m208908\u001b[0m: \u001b[32m'Big Sioux River Corridor South Dakota and Iowa'\u001b[0m,\n", " \u001b[1;36m161392\u001b[0m: \u001b[32m'Upper Iowa River Wildlife Management Areas'\u001b[0m,\n", " \u001b[1;36m221041\u001b[0m: \u001b[32m'Iowa KS-NE Reservation'\u001b[0m,\n", " \u001b[1;36m207525\u001b[0m: \u001b[32m'University of Northern Iowa'\u001b[0m,\n", " \u001b[1;36m137891\u001b[0m: \u001b[32m'Pammel State Park, Winterset, Iowa'\u001b[0m,\n", " \u001b[1;36m125537\u001b[0m: \u001b[32m'Terry Trueblood Wetland Exploration Trail'\u001b[0m,\n", " \u001b[1;36m172799\u001b[0m: \u001b[32m'Ashton Prairie'\u001b[0m,\n", " \u001b[1;36m174271\u001b[0m: \u001b[32m'Des Moines Iowa Park'\u001b[0m,\n", " \u001b[1;36m151098\u001b[0m: 
# Optional: search for a place ID by name
places = client.places.autocomplete(q='iowa').all()
inat_pprint(dict((place.id, place.name) for place in places))

# Optional: reload from previously loaded results, if available
# NOTE(review): this snippet needs `exists` and `Observation` in scope; the import cell
# does not provide either of them
# if exists(DATASET_FILENAME):
#     with open(DATASET_FILENAME) as f:
#         observations = Observation.from_json_file(f)
# else:

# Fetch every page of matching monarch observations
search_filters = dict(
    taxon_name='Danaus plexippus',
    photos=True,
    geo=True,
    geoprivacy='open',
    place_id=24,  # Iowa
)
observations = client.observations.search(**search_filters).all()

# Save results for future usage (convert to JSON format for compatibility)
observations_json = [record.to_dict() for record in observations]
with open(DATASET_FILENAME, 'w') as f:
    # default=str stringifies any value that the json module cannot serialize itself
    f.write(json.dumps(observations_json, indent=4, sort_keys=True, default=str))

print(f'Total observations: {len(observations)}')


def _observation_row(obs):
    """Flatten one observation into the columns used by the plots (None for a missing taxon/user)"""
    taxon = obs.taxon
    user = obs.user
    return {
        'id': obs.id,
        'observed_on': obs.observed_on,
        'quality_grade': obs.quality_grade,
        'taxon.id': taxon.id if taxon else None,
        'taxon.name': taxon.name if taxon else None,
        'taxon.rank': taxon.rank if taxon else None,
        'taxon.preferred_common_name': taxon.preferred_common_name if taxon else None,
        'user.login': user.login if user else None,
    }


# Convert observations to DataFrame (one row per observation)
df = pd.DataFrame([_observation_row(obs) for obs in observations])

# Normalize timezones; rows without an observed_on value are skipped by the conversion
# and remain missing after the assignment
observed_local = df['observed_on'].dropna().apply(to_local_tz)
df['observed_on'] = observed_local

# Add some extra date/time columns that matplotlib can more easily handle
df['observed_time_mp'] = df['observed_on'].apply(date_to_mpl_time)
df['observed_on_mp'] = df['observed_on'].apply(date_to_mpl_day_of_year)

# Optional: narrow down to just a few columns of interest
# NOTE(review): `OBS_COLUMNS` is not defined in the visible cells; the setup cell defines
# `BASIC_OBS_COLUMNS` -- confirm which name is intended
# pprint(list(sorted(df.columns)))
# df = df[OBS_COLUMNS]

# Optional: Hacky way of setting limits by adding outliers
# JointGrid + hexbin doesn't make it easy to do this the 'right' way without distorting the plot
# NOTE: DataFrame.append was removed in pandas 2.0; pd.concat([df, df2]) is the replacement
# df2 = pd.DataFrame([
#     {'observed_on': datetime(2020, 1, 1, 0, 0, 0, tzinfo=tz.tzlocal()), 'quality_grade': 'research'},
#     {'observed_on': datetime(2020, 12, 31, 23, 59, 59, tzinfo=tz.tzlocal()), 'quality_grade': 'research'},
# ])
# df = df.append(df2)
| \n", " | observed_month | \n", "quality_grade | \n", "counts | \n", "
|---|---|---|---|
| 0 | \n", "3.0 | \n", "research | \n", "1 | \n", "
| 1 | \n", "4.0 | \n", "research | \n", "4 | \n", "
| 2 | \n", "5.0 | \n", "research | \n", "67 | \n", "
| 3 | \n", "6.0 | \n", "casual | \n", "2 | \n", "
| 4 | \n", "6.0 | \n", "needs_id | \n", "1 | \n", "
| 5 | \n", "6.0 | \n", "research | \n", "214 | \n", "
| 6 | \n", "7.0 | \n", "casual | \n", "8 | \n", "
| 7 | \n", "7.0 | \n", "research | \n", "487 | \n", "
| 8 | \n", "8.0 | \n", "casual | \n", "6 | \n", "
| 9 | \n", "8.0 | \n", "research | \n", "658 | \n", "
| 10 | \n", "9.0 | \n", "casual | \n", "8 | \n", "
| 11 | \n", "9.0 | \n", "research | \n", "623 | \n", "
| 12 | \n", "10.0 | \n", "research | \n", "109 | \n", "
| 13 | \n", "11.0 | \n", "casual | \n", "2 | \n", "
| 14 | \n", "12.0 | \n", "research | \n", "1 | \n", "